#Just list everything that might be used
import pandas as pd
import matplotlib.pyplot as plt
import pandas
import numpy as np
from zipfile import ZipFile
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings;
warnings.simplefilter('ignore')
import plotly.express as px
import plotly.graph_objects as go
#get data
df = pd.read_csv('zameen-updated.csv')
display(df.head(3))
print('jumlah kolom kosong')
display(df.isna().sum())
print("datasource: kaggle")
| property_id | location_id | page_url | property_type | price | location | city | province_name | latitude | longitude | baths | area | purpose | bedrooms | date_added | agency | agent | Area Type | Area Size | Area Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 237062 | 3325 | https://www.zameen.com/Property/g_10_g_10_2_gr... | Flat | 10000000 | G-10 | Islamabad | Islamabad Capital | 33.679890 | 73.012640 | 2 | 4 Marla | For Sale | 2 | 02-04-2019 | NaN | NaN | Marla | 4.0 | 0-5 Marla |
| 1 | 346905 | 3236 | https://www.zameen.com/Property/e_11_2_service... | Flat | 6900000 | E-11 | Islamabad | Islamabad Capital | 33.700993 | 72.971492 | 3 | 5.6 Marla | For Sale | 3 | 05-04-2019 | NaN | NaN | Marla | 5.6 | 5-10 Marla |
| 2 | 386513 | 764 | https://www.zameen.com/Property/islamabad_g_15... | House | 16500000 | G-15 | Islamabad | Islamabad Capital | 33.631486 | 72.926559 | 6 | 8 Marla | For Sale | 5 | 07-17-2019 | NaN | NaN | Marla | 8.0 | 5-10 Marla |
jumlah kolom kosong
property_id 0 location_id 0 page_url 0 property_type 0 price 0 location 0 city 0 province_name 0 latitude 0 longitude 0 baths 0 area 0 purpose 0 bedrooms 0 date_added 0 agency 44071 agent 44072 Area Type 0 Area Size 0 Area Category 0 dtype: int64
datasource: kaggle
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 168446 entries, 0 to 168445 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 property_id 168446 non-null int64 1 location_id 168446 non-null int64 2 page_url 168446 non-null object 3 property_type 168446 non-null object 4 price 168446 non-null int64 5 location 168446 non-null object 6 city 168446 non-null object 7 province_name 168446 non-null object 8 latitude 168446 non-null float64 9 longitude 168446 non-null float64 10 baths 168446 non-null int64 11 area 168446 non-null object 12 purpose 168446 non-null object 13 bedrooms 168446 non-null int64 14 date_added 168446 non-null object 15 agency 124375 non-null object 16 agent 124374 non-null object 17 Area Type 168446 non-null object 18 Area Size 168446 non-null float64 19 Area Category 168446 non-null object dtypes: float64(3), int64(5), object(12) memory usage: 25.7+ MB
#filling NaN
df['agency'] = df['agency'].fillna('unknown')
df['agent'] = df['agent'].fillna('unknown')
df.head(3)
| property_id | location_id | page_url | property_type | price | location | city | province_name | latitude | longitude | baths | area | purpose | bedrooms | date_added | agency | agent | Area Type | Area Size | Area Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 237062 | 3325 | https://www.zameen.com/Property/g_10_g_10_2_gr... | Flat | 10000000 | G-10 | Islamabad | Islamabad Capital | 33.679890 | 73.012640 | 2 | 4 Marla | For Sale | 2 | 02-04-2019 | unknown | unknown | Marla | 4.0 | 0-5 Marla |
| 1 | 346905 | 3236 | https://www.zameen.com/Property/e_11_2_service... | Flat | 6900000 | E-11 | Islamabad | Islamabad Capital | 33.700993 | 72.971492 | 3 | 5.6 Marla | For Sale | 3 | 05-04-2019 | unknown | unknown | Marla | 5.6 | 5-10 Marla |
| 2 | 386513 | 764 | https://www.zameen.com/Property/islamabad_g_15... | House | 16500000 | G-15 | Islamabad | Islamabad Capital | 33.631486 | 72.926559 | 6 | 8 Marla | For Sale | 5 | 07-17-2019 | unknown | unknown | Marla | 8.0 | 5-10 Marla |
#check if there's duplicated datas
df.duplicated().sum()
0
#country code, maybe this'll useful soon? if not its alright
df["code"] = "PK"
df.head(3)
| property_id | location_id | page_url | property_type | price | location | city | province_name | latitude | longitude | ... | area | purpose | bedrooms | date_added | agency | agent | Area Type | Area Size | Area Category | code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 237062 | 3325 | https://www.zameen.com/Property/g_10_g_10_2_gr... | Flat | 10000000 | G-10 | Islamabad | Islamabad Capital | 33.679890 | 73.012640 | ... | 4 Marla | For Sale | 2 | 02-04-2019 | unknown | unknown | Marla | 4.0 | 0-5 Marla | PK |
| 1 | 346905 | 3236 | https://www.zameen.com/Property/e_11_2_service... | Flat | 6900000 | E-11 | Islamabad | Islamabad Capital | 33.700993 | 72.971492 | ... | 5.6 Marla | For Sale | 3 | 05-04-2019 | unknown | unknown | Marla | 5.6 | 5-10 Marla | PK |
| 2 | 386513 | 764 | https://www.zameen.com/Property/islamabad_g_15... | House | 16500000 | G-15 | Islamabad | Islamabad Capital | 33.631486 | 72.926559 | ... | 8 Marla | For Sale | 5 | 07-17-2019 | unknown | unknown | Marla | 8.0 | 5-10 Marla | PK |
3 rows × 21 columns
ax = sns.boxplot(data = df['price'])
ax = sns.boxplot(data = df[['baths', 'bedrooms']])
ax = sns.boxplot(data = df[['Area Size']])
#making function to replace outlier
def outlier (x):
sorted(x)
q1, q3 = x.quantile([0.25, 0.75])
IQR = q3 - q1
lwr_bound = q1 - (1.5*IQR)
upr_bound = q3 + (1.5*IQR)
return lwr_bound, upr_bound
low, high = outlier(df['price'])
low2, high2 = outlier(df['baths'])
low3, high3 = outlier(df['bedrooms'])
low4, high4 = outlier(df['Area Size'])
*this can be used too def cap_data(df): for col in df.columns: print("capping the ",col) if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')): percentiles = df[col].quantile([0.01,0.99]).values df[col][df[col] <= percentiles[0]] = percentiles[0] df[col][df[col] >= percentiles[1]] = percentiles[1] else: df[col]=df[col] return df
final_df=cap_data(df)
#replacing outlier with upper bound and lower bound value
df['price'] = np.where(df['price']>high, high, df['price'])
df['price'] = np.where(df['price']<low, low, df['price'])
df['baths'] = np.where(df['baths']>high2, high2, df['baths'])
df['baths'] = np.where(df['baths']<low2, low2, df['baths'])
df['bedrooms'] = np.where(df['bedrooms']>high3, high3, df['bedrooms'])
df['bedrooms'] = np.where(df['bedrooms']<low3, low3, df['bedrooms'])
df['Area Size'] = np.where(df['Area Size']>high4, high4, df['Area Size'])
df['Area Size'] = np.where(df['Area Size']<low4, low4, df['Area Size'])
ax = sns.boxplot(data = df['price'])
ax = sns.boxplot(data = df[['baths', 'bedrooms']])
ax = sns.boxplot(data = df[['Area Size']])
#price distribution
plt.figure(figsize=(10,5))
plt.hist(df['price'], bins=70, range=(1,2500000), color='green')
plt.title('Distribusi Harga Properti di Pakistan',fontsize=19, color='blue')
plt.xlabel('harga (ribu)', fontsize = 12, color='red')
plt.ylabel('Jumlah rumah',fontsize = 12, color='red')
plt.xlim(xmin=0,xmax=800000)
labels, locations = plt.xticks()
plt.xticks(labels, (labels/1000).astype(int))
plt.show()
Most houses/property have price less than 100000
plt.clf()
df.groupby('city')['price'].mean().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Harga Properti per-Kota di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 15, color='red')
plt.ylabel('Harga (seratus ribu)', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=0)
plt.show()
Lahore City have high average house/property price.
plt.clf()
df.groupby('property_type')['price'].mean().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Harga Properti per-Tipe di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Tipe Properti', fontsize = 15, color='red')
plt.ylabel('Harga (seratus ribu)', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=20)
plt.show()
Farm House have the expensive price than other house/property type.
df.groupby(['city', 'property_type'])['property_id'].count().unstack().plot(kind='bar', figsize=(8,5))
plt.title('Jumlah Properti per-Tipe dan Kota di Pakistan',loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 12, color='red')
plt.ylabel('Jumlah (ribu)',fontsize = 12, color='red')
plt.legend(bbox_to_anchor=(1, 1), shadow=True, title='tipe')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/1000).astype(int))
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()
df.groupby(['city', 'property_type'])['Area Size'].mean().unstack().plot(kind='bar', figsize=(8,5))
plt.title('Luas Rumah per-Tipe dan Kota di Pakistan',loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 12, color='red')
plt.ylabel('Luas',fontsize = 12, color='red')
plt.legend(bbox_to_anchor=(1, 1), shadow=True, title='Tipe')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels).astype(int))
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()
plt.clf()
df.groupby('property_type')['property_id'].count().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Jumlah Properti Berdasarkan Tipenya di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Tipe Properti', fontsize = 15, color='red')
plt.ylabel('Jumlah', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=20)
plt.show()
House is the most property in Pakistan
df.columns
Index(['property_id', 'location_id', 'page_url', 'property_type', 'price',
'location', 'city', 'province_name', 'latitude', 'longitude', 'baths',
'area', 'purpose', 'bedrooms', 'date_added', 'agency', 'agent',
'Area Type', 'Area Size', 'Area Category', 'code'],
dtype='object')
df2 = df[['price', 'baths', 'bedrooms', 'Area Size', 'property_type']]
plt.figure(figsize=(20,24))
sns.set_theme('notebook', style='white')
sns.set_palette("bright")
g = sns.pairplot(df2, hue='property_type', diag_kind="hist", corner=True, palette = 'bright')
plt.show()
<Figure size 1440x1728 with 0 Axes>
df3 = df[(df['purpose']=='For Sale')]
df4 = df[(df['purpose']=='For Rent')]
import plotly.graph_objects as go
import pandas as pd
fig = px.scatter_mapbox(df3,
lat = df3['latitude'],
lon = df3['longitude'],
center={'lat': 31.46500, 'lon': 73.76000},
zoom = 8,
color = df3['price'],
size = df3['Area Size'],
color_continuous_scale=px.colors.cyclical.IceFire,
width = 720,
height = 500,
title = 'Pakistan House Price Scatter Map (for sale)',
hover_name="city"
#labels = {'City':'city'}
#text=df["city"].astype(str),
#popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
)
fig.update_layout(mapbox_style = 'open-street-map')
#fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('The larger the area size, the larger the point size (in map)')
The larger the area size, the larger the point size (in map)
fig = px.scatter_mapbox(df4,
lat = df4['latitude'],
lon = df4['longitude'],
center={'lat': 31.46500, 'lon': 73.76000},
zoom = 8,
color = df4['price'],
size = df4['Area Size'],
color_continuous_scale=px.colors.cyclical.IceFire,
width = 720,
height = 500,
title = 'Pakistan House Price Scatter Map (for rent)',
hover_name="city"
#labels = {'City':'city'}
#text=df["city"].astype(str),
#popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
)
fig.update_layout(mapbox_style = 'open-street-map')
#fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('The larger the area size, the larger the point size (in map)')
The larger the area size, the larger the point size (in map)
from this map, I detect some mistake in the data. Whether its the coordinates or the city name.
fig = px.scatter_mapbox(df,
lat = df['latitude'],
lon = df['longitude'],
center={'lat': 31.46500, 'lon': 73.76000},
zoom = 8,
color = df['property_type'],
#size = df4['price'],
#color_continuous_scale=px.colors.cyclical.IceFire,
width = 720,
height = 500,
title = 'Pakistan Property Type Scatter Map',
hover_name="city"
#labels = {'City':'city'}
#text=df["city"].astype(str),
#popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
)
fig.update_layout(mapbox_style = 'open-street-map')
fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('plot complete and move up')
plot complete and move up
From maps above and visualization before, I can conclude why Lahore Have the highest property price. Its because this city has few properties for rent and many properties for sale and have lots of houses and a few flats, upper/lower portion, and room. Maybe Property price influenced by other variables that not listed in the data too.
df5 = df.drop(['property_id','location_id', 'page_url', 'location', 'latitude','longitude','area','date_added','agency','agent','Area Category'],axis="columns")
df5.head()
| property_type | price | city | province_name | baths | purpose | bedrooms | Area Type | Area Size | code | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Flat | 10000000.0 | Islamabad | Islamabad Capital | 2.0 | For Sale | 2.0 | Marla | 4.0 | PK |
| 1 | Flat | 6900000.0 | Islamabad | Islamabad Capital | 3.0 | For Sale | 3.0 | Marla | 5.6 | PK |
| 2 | House | 16500000.0 | Islamabad | Islamabad Capital | 6.0 | For Sale | 5.0 | Marla | 8.0 | PK |
| 3 | House | 43500000.0 | Islamabad | Islamabad Capital | 4.0 | For Sale | 4.0 | Kanal | 2.0 | PK |
| 4 | House | 7000000.0 | Islamabad | Islamabad Capital | 3.0 | For Sale | 3.0 | Marla | 8.0 | PK |
# split data 1
x = df5[['property_type', 'city', 'purpose', 'Area Size', 'baths', 'bedrooms']]
y = df5["price"]
# Apply logarithmic transformation to the target variable
y_log = np.log1p(y)
# split data 2
x_train, x_test, y_train, y_test = train_test_split(x, y_log, test_size=0.2, random_state=0)
# Scaling
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))
# Preprocessing the input features
transformer = ColumnTransformer(transformers=[
('tnf1', OrdinalEncoder(categories=[['For Rent', 'For Sale']]), ['purpose']),
('tnf2', OneHotEncoder(sparse=False, drop='first'), ['property_type', 'city']),
('tnf3', 'passthrough', ['baths', 'bedrooms'])
], remainder='passthrough')
x_train_transformed = transformer.fit_transform(x_train)
x_test_transformed = transformer.transform(x_test)
# algoritma linear regression
lr = LinearRegression()
lr.fit(x_train_transformed, y_train_scaled)
# prediksi
y_pred_scaled = lr.predict(x_test_transformed)
# kembaliin y_pred!
y_pred = np.expm1(scaler.inverse_transform(y_pred_scaled))
print(y_pred)
[[10513225.22113776] [ 5735409.93628062] [ 7541761.94064282] ... [19276297.22845063] [ 138985.19122532] [ 7399879.90387523]]
#validation
mean_absolute_error(y_test_scaled, y_pred_scaled)
0.20625226567860655
# Buat DataFrame baru dengan ciri yang ingin diprediksi sendiri
new_data = pd.DataFrame({
'property_type': ['Flat'],
'city': ['Islamabad'],
'purpose': ['For Sale'],
'Area Size': [2],
'baths': [3],
'bedrooms': [3]
})
# Transformasikan data baru menggunakan transformer yang telah ada
new_data_transformed = transformer.transform(new_data)
#prediksi dengan model
y_pred_scaled = lr.predict(new_data_transformed)
# Kembalikan nilai prediksi ke skala semula
y_pred = np.expm1(scaler.inverse_transform(y_pred_scaled))
# Cetak prediksi harga rumah
print("prediksi harga rumah:", y_pred)
prediksi harga rumah: [[8494462.58337712]]